Lesson 6


Scatterplot Review

data("diamonds")
## Warning in data("diamonds"): data set 'diamonds' not found
library(ggplot2)

ggplot(data = diamonds, mapping = aes(x = carat, y = price)) + 
  scale_x_continuous(limits = c(0,quantile(diamonds$carat,0.99))) +
  scale_y_continuous(limits = c(0,quantile(diamonds$price,0.99))) +
  geom_point(na.rm = TRUE,color = 'black', fill = 'orange', shape = 21)


Price and Carat Relationship

Response: Non-linear looks more exponential


ggplot(data = diamonds, mapping = aes(x = carat, y = price)) + 
  scale_x_continuous(limits = c(0,quantile(diamonds$carat,0.99))) +
  scale_y_continuous(limits = c(0,quantile(diamonds$price,0.99))) +
  geom_point(na.rm = TRUE) + geom_smooth(method = 'lm')

A diamond is FOREVER


ggpairs Function

# install these if necessary

# install.packages('GGally')
#install.packages('scales')
#install.packages('memisc')
# install.packages('lattice')
# install.packages('MASS')
# install.packages('car')
# install.packages('reshape')
# install.packages('plyr')

# load the ggplot graphics package and the others
library(ggplot2)
library(GGally)
library(scales)
library(memisc)

# sample 10,000 diamonds from the data set
set.seed(20022012)
diamond_samp <- diamonds[sample(1:length(diamonds$price), 10000), ]
ggpairs(diamond_samp,
  lower = list(continuous = wrap("points", shape = I('.'))),
  upper = list(combo = wrap("box", outlier.shape = I('.'))))


The Demand of Diamonds

library(gridExtra)

plot1 <- ggplot(data = diamonds, mapping = aes(x = price)) + 
  geom_histogram() +
  ggtitle('Price')

plot2 <- plot1 + scale_x_log10() + ggtitle('Price (log10)')

grid.arrange(plot1,plot2)


Scatterplot Transformation

ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
  geom_point() + 
  scale_y_log10()

Create a new function to transform the carat variable

cuberoot_trans = function() trans_new('cuberoot', transform = function(x) x^(1/3),
                                      inverse = function(x) x^3)

Use the cuberoot_trans function

ggplot(aes(carat, price), data = diamonds) + 
  geom_point(na.rm = TRUE) + 
  scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
                     breaks = c(0.2, 0.5, 1, 2, 3)) + 
  scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
                     breaks = c(350, 1000, 5000, 10000, 15000)) +
  ggtitle('Price (log10) by Cube-Root of Carat')


Overplotting Revisited

head(sort(table(diamonds$carat),decreasing = T))
## 
##  0.3 0.31 1.01  0.7 0.32    1 
## 2604 2249 2242 1981 1840 1558
head(sort(table(diamonds$price),decreasing = T))
## 
## 605 802 625 828 776 698 
## 132 127 126 125 124 121
ggplot(aes(carat, price), data = diamonds) + 
  geom_point(na.rm = TRUE, alpha = 1/20, size = 3/4, position = 'jitter') + 
  scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
                     breaks = c(0.2, 0.5, 1, 2, 3)) + 
  scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
                     breaks = c(350, 1000, 5000, 10000, 15000)) +
  ggtitle('Price (log10) by Cube-Root of Carat')


Other Qualitative Factors

Notes: Clarity is another feature which might be useful


Price vs. Carat and Clarity

Alter the code below.

# install and load the RColorBrewer package
# install.packages('RColorBrewer')
library(RColorBrewer)

ggplot(aes(x = carat, y = price), data = diamonds) + 
  geom_point(na.rm = TRUE, alpha = 0.5, size = 1, position = 'jitter',
             mapping = aes(color = clarity)) +
  scale_color_brewer(type = 'div',
    guide = guide_legend(title = 'Clarity', reverse = T,
    override.aes = list(alpha = 1, size = 2))) +  
  scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
    breaks = c(0.2, 0.5, 1, 2, 3)) + 
  scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
    breaks = c(350, 1000, 5000, 10000, 15000)) +
  ggtitle('Price (log10) by Cube-Root of Carat and Clarity')


Clarity and Price

Response: Price does vary by clarity, diamonds with higher clarity have higher prices


Price vs. Carat and Cut

Alter the code below.

ggplot(aes(x = carat, y = price, color = clarity), data = diamonds) + 
  geom_point(na.rm = TRUE, alpha = 0.5, size = 1, position = 'jitter',
             mapping = aes(color = cut)) +
  scale_color_brewer(type = 'div',
                     guide = guide_legend(title = 'Clarity', reverse = T,
                                          override.aes = list(alpha = 1, size = 2))) +  
  scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
                     breaks = c(0.2, 0.5, 1, 2, 3)) + 
  scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
                     breaks = c(350, 1000, 5000, 10000, 15000)) +
  ggtitle('Price (log10) by Cube-Root of Carat and Clarity')


Cut and Price

Response: Not much variation is explained by coloring using the cut of the diamonds


Price vs. Carat and Color

Alter the code below.

ggplot(data = diamonds, aes(x = carat, y = price, color = color)) + 
  geom_point(na.rm = TRUE, alpha = 0.5, size = 1, position = 'jitter') +
  scale_color_brewer(type = 'div',
                     guide = guide_legend(title = 'Color', reverse = T,
                                          override.aes = list(alpha = 1, size = 2))) +  
  scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
                     breaks = c(0.2, 0.5, 1, 2, 3)) + 
  scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
                     breaks = c(350, 1000, 5000, 10000, 15000)) +
  ggtitle('Price (log10) by Cube-Root of Carat and Cut')


Linear Models in R


Building the Linear Model

m1 <- lm(I(log(price)) ~ I(carat^(1/3)), data = diamonds)
m2 <- update(m1, ~ . + carat)
m3 <- update(m2, ~ . + cut)
m4 <- update(m3, ~ . + color)
m5 <- update(m4, ~ . + clarity)
mtable(m1, m2, m3, m4, m5)
## 
## Calls:
## m1: lm(formula = I(log(price)) ~ I(carat^(1/3)), data = diamonds)
## m2: lm(formula = I(log(price)) ~ I(carat^(1/3)) + carat, data = diamonds)
## m3: lm(formula = I(log(price)) ~ I(carat^(1/3)) + carat + cut, data = diamonds)
## m4: lm(formula = I(log(price)) ~ I(carat^(1/3)) + carat + cut + color, 
##     data = diamonds)
## m5: lm(formula = I(log(price)) ~ I(carat^(1/3)) + carat + cut + color + 
##     clarity, data = diamonds)
## 
## ========================================================================================
##                        m1            m2            m3            m4            m5       
## ----------------------------------------------------------------------------------------
##   (Intercept)         2.821***      1.039***      0.874***      0.932***      0.415***  
##                      (0.006)       (0.019)       (0.019)       (0.017)       (0.010)    
##   I(carat^(1/3))      5.558***      8.568***      8.703***      8.438***      9.144***  
##                      (0.007)       (0.032)       (0.031)       (0.028)       (0.016)    
##   carat                            -1.137***     -1.163***     -0.992***     -1.093***  
##                                    (0.012)       (0.011)       (0.010)       (0.006)    
##   cut: .L                                         0.224***      0.224***      0.120***  
##                                                  (0.004)       (0.004)       (0.002)    
##   cut: .Q                                        -0.062***     -0.062***     -0.031***  
##                                                  (0.004)       (0.003)       (0.002)    
##   cut: .C                                         0.051***      0.052***      0.014***  
##                                                  (0.003)       (0.003)       (0.002)    
##   cut: ^4                                         0.018***      0.018***     -0.002     
##                                                  (0.003)       (0.002)       (0.001)    
##   color: .L                                                    -0.373***     -0.441***  
##                                                                (0.003)       (0.002)    
##   color: .Q                                                    -0.129***     -0.093***  
##                                                                (0.003)       (0.002)    
##   color: .C                                                     0.001        -0.013***  
##                                                                (0.003)       (0.002)    
##   color: ^4                                                     0.029***      0.012***  
##                                                                (0.003)       (0.002)    
##   color: ^5                                                    -0.016***     -0.003*    
##                                                                (0.003)       (0.001)    
##   color: ^6                                                    -0.023***      0.001     
##                                                                (0.002)       (0.001)    
##   clarity: .L                                                                 0.907***  
##                                                                              (0.003)    
##   clarity: .Q                                                                -0.240***  
##                                                                              (0.003)    
##   clarity: .C                                                                 0.131***  
##                                                                              (0.003)    
##   clarity: ^4                                                                -0.063***  
##                                                                              (0.002)    
##   clarity: ^5                                                                 0.026***  
##                                                                              (0.002)    
##   clarity: ^6                                                                -0.002     
##                                                                              (0.002)    
##   clarity: ^7                                                                 0.032***  
##                                                                              (0.001)    
## ----------------------------------------------------------------------------------------
##   R-squared           0.924         0.935         0.939         0.951         0.984     
##   N               53940         53940         53940         53940         53940         
## ========================================================================================
##   Significance: *** = p < 0.001; ** = p < 0.01; * = p < 0.05

Notice how adding cut to our model does not help explain much of the variance in the price of diamonds. This fits with out exploration earlier.


Model Problems

Research: (Take some time to come up with 2-4 problems for the model) (You should 10-20 min on this)


A Bigger, Better Data Set

#install.package('bitops') # throws an error
#install.packages('RCurl')
#library('bitops')
library('RCurl')
## Loading required package: bitops
#diamondsurl = getBinaryURL("https://raw.github.com/solomonm/diamonds-data/master/BigDiamonds.Rda")
#load(rawConnection(diamondsurl))
load('./BigDiamonds.Rda')
names(diamondsbig)
##  [1] "carat"        "cut"          "color"        "clarity"     
##  [5] "table"        "depth"        "cert"         "measurements"
##  [9] "price"        "x"            "y"            "z"

The code used to obtain the data is available here: https://github.com/solomonm/diamonds-data

Building a Model Using the Big Diamonds Data Set

Notes:

m1 <- lm(I(log(price)) ~ I(carat^(1/3)), data = diamondsbig)
m2 <- update(m1, ~ . + carat)
m3 <- update(m2, ~ . + cut)
m4 <- update(m3, ~ . + color)
m5 <- update(m4, ~ . + clarity)

suppressMessages(library(lattice))
suppressMessages(library(MASS))
suppressMessages(library(memisc))
mtable(m1, m2, m3, m4, m5)
## 
## Calls:
## m1: lm(formula = I(log(price)) ~ I(carat^(1/3)), data = diamondsbig)
## m2: lm(formula = I(log(price)) ~ I(carat^(1/3)) + carat, data = diamondsbig)
## m3: lm(formula = I(log(price)) ~ I(carat^(1/3)) + carat + cut, data = diamondsbig)
## m4: lm(formula = I(log(price)) ~ I(carat^(1/3)) + carat + cut + color, 
##     data = diamondsbig)
## m5: lm(formula = I(log(price)) ~ I(carat^(1/3)) + carat + cut + color + 
##     clarity, data = diamondsbig)
## 
## =============================================================================================
##                        m1             m2             m3             m4             m5        
## ---------------------------------------------------------------------------------------------
##   (Intercept)          3.096***       1.406***       1.218***       0.405***      -0.663***  
##                       (0.002)        (0.005)        (0.005)        (0.006)        (0.006)    
##   I(carat^(1/3))       5.317***       7.911***       7.920***       8.170***       8.368***  
##                       (0.002)        (0.008)        (0.008)        (0.007)        (0.005)    
##   carat                              -0.767***      -0.779***      -0.782***      -0.815***  
##                                      (0.002)        (0.002)        (0.002)        (0.001)    
##   cut: V.Good                                        0.119***       0.092***       0.059***  
##                                                     (0.002)        (0.002)        (0.001)    
##   cut: Ideal                                         0.256***       0.222***       0.130***  
##                                                     (0.002)        (0.001)        (0.001)    
##   color: K/L                                                        0.134***       0.128***  
##                                                                    (0.004)        (0.003)    
##   color: J/L                                                        0.302***       0.325***  
##                                                                    (0.004)        (0.003)    
##   color: I/L                                                        0.422***       0.457***  
##                                                                    (0.003)        (0.003)    
##   color: H/L                                                        0.517***       0.560***  
##                                                                    (0.003)        (0.003)    
##   color: G/L                                                        0.627***       0.661***  
##                                                                    (0.003)        (0.002)    
##   color: F/L                                                        0.723***       0.751***  
##                                                                    (0.003)        (0.002)    
##   color: E/L                                                        0.790***       0.805***  
##                                                                    (0.003)        (0.002)    
##   color: D/L                                                        0.894***       0.886***  
##                                                                    (0.003)        (0.003)    
##   clarity: I1                                                                      0.355***  
##                                                                                   (0.005)    
##   clarity: SI2                                                                     0.684***  
##                                                                                   (0.005)    
##   clarity: SI1                                                                     0.834***  
##                                                                                   (0.005)    
##   clarity: VS2                                                                     0.979***  
##                                                                                   (0.005)    
##   clarity: VS1                                                                     1.067***  
##                                                                                   (0.005)    
##   clarity: VVS2                                                                    1.145***  
##                                                                                   (0.005)    
##   clarity: VVS1                                                                    1.224***  
##                                                                                   (0.005)    
##   clarity: IF                                                                      1.346***  
##                                                                                   (0.005)    
## ---------------------------------------------------------------------------------------------
##   R-squared            0.893          0.911          0.915          0.940          0.968     
##   N               597311         597311         597311         597311         597311         
## =============================================================================================
##   Significance: *** = p < 0.001; ** = p < 0.01; * = p < 0.05

Predictions

Example Diamond from BlueNile: Round 1.00 Very Good I VS1 $5,601

# Be sure you’ve loaded the library memisc and have m5 saved as an object in your workspace.
thisDiamond = data.frame(carat = 1.00, cut = "V.Good",
                         color = "I", clarity="VS1")
modelEstimate = predict(m5, newdata = thisDiamond,
                        interval="prediction", level = .95)
dat = data.frame(m4$model, m4$residuals)

with(dat, sd(m4.residuals))
## [1] 0.3176132
with(subset(dat, carat > .9 & carat < 1.1), sd(m4.residuals))
## [1] 0.3668827
dat$resid <- as.numeric(dat$m4.residuals)
ggplot(aes(y = resid, x = round(carat, 2)), data = dat) +
  geom_line(stat = "summary", fun.y = sd)